
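# File: topic_model.R
# Purpose: This script chooses the number of topics for LDA topic models of the hearings (text2vec package)
#          using held-out perplexity and coherence, fits 70-topic LDA models on grouped-utterance and
#          speaker-hearing documents, and estimates a structural topic model (stm package).
# Input: ../data/finalData.RData
# Output: ../output/topic_robustness_JOPRR1_Grped.RData, ../output/topic_robustness_JOPRR1_spkr.RData,
#         ../output/figures/coherence_SI.pdf, ../output/stm_results.RData, ../data/finalData_70kGrped.RData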

# Setup ----
# NOTE(review): rm(list = ls()) deletes every object in the calling workspace
# when this script is sourced from an interactive session. Kept so the script
# still starts from an empty workspace, as before.
rm(list = ls())
gc()
# Seeds R's random number generator once; the sample() calls below draw from it.
set.seed(123)

# library() stops immediately when a package is missing, whereas require()
# only returns FALSE and lets the script fail later at the first missing
# function. Attach order is unchanged.
library(text2vec)
library(textstem)
library(stopwords)
library(tidyverse)
library(textmineR)
# Normalise raw text before lemmatisation and tokenisation.
#
# x: character vector of documents.
#
# Returns a character vector in which every element has been lower-cased,
# had its apostrophes removed (so "don't" becomes "dont"), had every remaining
# non-letter character replaced by a space, and had runs of whitespace
# collapsed to a single space. Leading/trailing spaces are not trimmed.
prep_fun <- function(x) {
  x <- str_to_lower(x)
  x <- str_replace_all(x, "'", "")
  x <- str_replace_all(x, "[^[:alpha:]]", " ")
  # Final step is returned as a plain expression (not an assignment) so the
  # result is visible when the function is called at the console.
  str_replace_all(x, "\\s+", " ")
}

# Fit LDA models over a grid of topic counts and score each one.
#
# Arguments:
#   train, test  data frames with the training and held-out documents
#   text, id     column names (strings) of the document text and document id
#   ks           topic counts to try
#
# The vocabulary, vectorizer and term co-occurrence matrix are built from
# `train` only; `test` is vectorised with the training vocabulary.
#
# Returns a list with two data frames:
#   perplexity  columns k and perplexity (computed on the test DTM), one row
#               per k in the order of `ks`
#   coherence   the coherence() metrics for the top-10 words of every topic,
#               plus `topic` and `k`; the most recently fitted k comes first
topicModel_fun <- function(train,
                           test,
                           text,
                           id,
                           ks = c(10,20,30,50,70,100,150,200,300,400,500,750,1000)) {
  # Stopwords with apostrophes stripped
  stop_terms <- c(gsub("'",'',stopwords()))

  train_tokens <- itoken(train[[text]], ids = train[[id]], progressbar = FALSE)
  vocab <- create_vocabulary(train_tokens, stopwords = stop_terms)
  vocab <- prune_vocabulary(vocab, doc_proportion_max = .2, term_count_min = 20)
  vectorizer <- vocab_vectorizer(vocab)
  dtm_train <- create_dtm(train_tokens, vectorizer)
  # NOTE(review): text2vec's coherence() documentation appears to expect a
  # reference tcm whose diagonal holds term document frequencies; this one uses
  # the create_tcm() defaults with a 20-token window -- confirm this is intended.
  tcm_train <- create_tcm(train_tokens, vectorizer, skip_grams_window = 20L)

  test_tokens <- itoken(test[[text]], ids = test[[id]], progressbar = FALSE)
  dtm_test <- create_dtm(test_tokens, vectorizer)

  perplexity_by_k <- NULL
  coherence_by_k <- NULL
  for (n_topics in ks) {
    lda_model <- LDA$new(n_topics = n_topics, doc_topic_prior = 0.1, topic_word_prior = 0.01)
    # Called for its side effect of fitting the model; the training
    # document-topic matrix it returns is not needed here.
    lda_model$fit_transform(
      x = dtm_train,
      n_iter = 1000,
      convergence_tol = 0.001,
      n_check_convergence = 25,
      progressbar = FALSE
    )
    top_words <- lda_model$get_top_words(n = 10, lambda = 1)

    # Coherence of each topic's top words; new results are prepended
    coh_df <- data.frame(
      coherence(top_words, tcm_train, n_doc_tcm = attr(vocab, 'document_count'))
    )
    coh_df <- mutate(coh_df, topic = row.names(coh_df), k = n_topics)
    coherence_by_k <- bind_rows(coh_df, coherence_by_k)

    # Perplexity on the held-out documents; new results are appended
    test_theta <- lda_model$transform(dtm_test)
    held_out_perplexity <- perplexity(
      dtm_test,
      topic_word_distribution = lda_model$topic_word_distribution,
      doc_topic_distribution = test_theta
    )
    perplexity_by_k <- bind_rows(
      perplexity_by_k,
      data.frame(k = n_topics, perplexity = held_out_perplexity)
    )

    # Progress marker
    cat(n_topics,'\n')
  }

  list(perplexity = perplexity_by_k,
       coherence = coherence_by_k)
}

# Topic model estimated on speaker-hearing concatenated text (JOP RR1)
load('../data/finalData.RData')

# Build grouped documents: utterances are ordered by speaker (opensecretsID)
# and fullInd, assigned a run id (rwID), and the cleaned text of every
# utterance sharing a speaker and rwID is pasted into combText.
text <- utterance_level %>%
  select(docID, fullInd, opensecretsID, textclean, nchars) %>%
  group_by(opensecretsID) %>%
  arrange(opensecretsID, fullInd) %>%
  # `over` flags utterances with nchars > 1000; rwID counts, within a speaker,
  # how many times `over` has changed so far (so it starts at 0).
  mutate(over = nchars > 1000,
         rwID = cumsum(over != lag(over, default = over[1]))) %>%
  # Running character total within each speaker-run
  group_by(opensecretsID, rwID) %>%
  mutate(cumchars = cumsum(nchars)) %>%
  ungroup() %>%
  # NOTE(review): group_id is not referenced again anywhere in this script.
  mutate(group_id = cumsum(cumchars > 1000) + 1) %>%
  # Rows whose preceding row has cumchars < 1000 inherit that row's rwID.
  # This runs on ungrouped data, so the "preceding row" can belong to another
  # speaker, and the first row (no predecessor) gets rwID = NA.
  # NOTE(review): confirm the cross-speaker lag is intended.
  mutate(rwID = ifelse(lag(cumchars) < 1000, lag(rwID), rwID)) %>%
  # NOTE(review): grouping is by speaker and rwID only (not docID) -- confirm.
  group_by(opensecretsID, rwID) %>%
  mutate(combText = paste(textclean, collapse = '. ')) %>%
  # Ungroup BEFORE recoding rwID: dplyr refuses to modify a grouping variable
  # inside a grouped mutate(). The recode is element-wise, so the result is
  # the one the original ordering was aiming for.
  ungroup() %>%
  mutate(rwID = ifelse(is.na(rwID), -1, rwID))

# Documents for the grouped topic model: one row per distinct
# (docID, opensecretsID, rwID, combText), with the combined text cleaned and
# lemmatised. Rows without a speaker id are dropped after lemmatisation.
# uniqueID concatenates docID, opensecretsID and rwID without a separator.
text2 <- text %>%
  ungroup() %>%
  distinct(docID, opensecretsID, rwID, combText) %>%
  mutate(
    textclean = lemmatize_strings(prep_fun(combText)),
    uniqueID = paste0(docID, opensecretsID, rwID)
  ) %>%
  filter(!is.na(opensecretsID)) %>%
  select(uniqueID, textclean)


# Random 80/20 train/test split of the grouped documents
inds <- sample(seq_len(nrow(text2)), size = round(0.8 * nrow(text2)), replace = FALSE)
train <- slice(text2, inds)
test <- slice(text2, -inds)

# Perplexity / coherence sweep over the default grid of topic counts
tmRes_Grped <- topicModel_fun(train, test, text = 'textclean', id = 'uniqueID')


save(tmRes_Grped, file = '../output/topic_robustness_JOPRR1_Grped.RData')

# Exploratory plot: held-out perplexity against the number of topics,
# each point labelled with its k
ggplot(tmRes_Grped$perplexity, aes(x = k, y = perplexity)) +
  geom_point() +
  geom_line() +
  geom_text(aes(label = k))

# Exploratory plot: distribution of per-topic coherence (mean_logratio) for each k.
# To inspect a single topic, filter(topic == 'topic_1') before plotting.
ggplot(tmRes_Grped$coherence, aes(x = factor(k), y = mean_logratio)) +
  geom_violin()


# Figure: average coherence (mean_logratio) per number of topics for k > 20,
# with a vertical line at k = 70. Infinite coherence values are dropped before
# averaging.
pdf('../output/figures/coherence_SI.pdf', width = 7, height = 5)
# Explicit print(): ggplot objects are only drawn automatically when evaluated
# at the top level of an interactive session or Rscript; under source() the
# device would otherwise be closed with nothing drawn on it.
print(
  tmRes_Grped$coherence %>%
    filter(k > 20) %>%
    filter(!is.infinite(mean_logratio)) %>%
    group_by(k) %>%
    # slice_max(mean_logratio,n = 100) %>%
    # Only the plotted metric is averaged; averaging every column would also
    # hit the character `topic` column (NA plus a warning).
    summarise(mean_logratio = mean(mean_logratio, na.rm = TRUE)) %>%
    ggplot(aes(x = k, y = mean_logratio)) +
    geom_point() +
    geom_line() +
    geom_vline(xintercept = c(70)) +
    ylab('Coherence (avg. log ratio)')
)
dev.off()


# Choosing the best
# Fit the 70-topic LDA on all grouped documents in text2.
# NOTE(review): this vocabulary includes bigrams and is pruned with
# doc_proportion_max = .5 / term_count_min = 10, whereas topicModel_fun builds
# a unigram vocabulary pruned with .2 / 20 -- confirm the difference is intended.
it <- itoken(text2$textclean, ids = text2$uniqueID, progressbar = FALSE)
v <- prune_vocabulary(
  create_vocabulary(it, ngram = c(1,2), stopwords = c(gsub("'",'',stopwords()))),
  doc_proportion_max = .5,
  term_count_min = 10
)
vectorizer <- vocab_vectorizer(v)
dtm <- create_dtm(it, vectorizer)
lda_model <- LDA$new(n_topics = 70, doc_topic_prior = 0.1, topic_word_prior = 0.01)
# Document-topic matrix for the fitted model
doc_topic_distr <- lda_model$fit_transform(
  x = dtm,
  n_iter = 1000,
  convergence_tol = 0.001,
  n_check_convergence = 25,
  progressbar = FALSE
)



# Reshape the document-topic matrix to long format: one row per
# (document id, topic) with its weight theta. The 'X' that data.frame()
# puts in front of the column names is stripped from the topic label.
doc_topic_distr <- doc_topic_distr %>%
  data.frame() %>%
  mutate(id = text2$uniqueID) %>%
  as_tibble() %>%
  gather(topic, theta, -id) %>%
  mutate(topic = gsub('X', '', topic))

# Attach the topic weights (wide, columns named topic70Grped_<n>) to every
# utterance row via the grouped-document id. The join key is stated
# explicitly instead of relying on whichever column names happen to match.
# NOTE(review): uniqueID is built without a separator, so distinct
# (docID, opensecretsID, rwID) triples could in principle collide -- confirm.
textToMerge <- text %>%
  mutate(uniqueID = paste0(docID, opensecretsID, rwID)) %>%
  left_join(doc_topic_distr %>%
              spread(topic, theta, sep = '70Grped_') %>%
              rename(uniqueID = id),
            by = 'uniqueID')

# Carry the topic columns back onto utterance_level.
# NOTE(review): assumes (docID, opensecretsID, fullInd) identifies a single
# row of textToMerge; otherwise this join duplicates utterances -- confirm.
utterance_level <- utterance_level %>%
  left_join(textToMerge %>%
              select(docID, opensecretsID, fullInd, matches('topic70Grped')),
            by = c('docID', 'opensecretsID', 'fullInd'))

# Calculate speaker topics instead
# One document per (docID, opensecretsID): all of that speaker's cleaned
# utterances in the hearing pasted together, then cleaned and lemmatised.
# uniqueID concatenates docID and opensecretsID without a separator.
text <- utterance_level %>%
  select(docID, opensecretsID, textclean) %>%
  group_by(docID, opensecretsID) %>%
  summarise(textclean = paste(textclean, collapse = '. ')) %>%
  ungroup() %>%
  mutate(textclean = lemmatize_strings(prep_fun(textclean))) %>%
  mutate(uniqueID = paste0(docID, opensecretsID))

# Random 80/20 train/test split of the speaker-hearing documents
inds <- sample(seq_len(nrow(text)), size = round(0.8 * nrow(text)), replace = FALSE)
train <- slice(text, inds)
test <- slice(text, -inds)

# Perplexity / coherence sweep over the default grid of topic counts
tmRes_spkr <- topicModel_fun(train, test, text = 'textclean', id = 'uniqueID')

# Exploratory plot: held-out perplexity against the number of topics
tmRes_spkr$perplexity %>%
  ggplot(aes(x = k, y = perplexity)) +
  geom_point() +
  geom_line() +
  geom_text(aes(label = k))

# Exploratory plot: average coherence (mean_logratio) per number of topics,
# with a vertical line at k = 70. Infinite values are dropped before averaging.
tmRes_spkr$coherence %>%
  # filter(k > 20) %>%
  filter(!is.infinite(mean_logratio)) %>%
  group_by(k) %>%
  # slice_max(mean_logratio,n = 100) %>%
  # Only the plotted metric is averaged; averaging every column would also
  # hit the character `topic` column (NA plus a warning).
  summarise(mean_logratio = mean(mean_logratio, na.rm = TRUE)) %>%
  ggplot(aes(x = k, y = mean_logratio)) +
  geom_point() +
  geom_line() +
  geom_vline(xintercept = c(70)) +
  ylab('Coherence (avg. log ratio)')

save(tmRes_spkr, file = '../output/topic_robustness_JOPRR1_spkr.RData')


# Choosing the best
# Fit the 70-topic LDA on all speaker-hearing documents in text.
# NOTE(review): this vocabulary includes bigrams and is pruned with
# doc_proportion_max = .5 / term_count_min = 10, whereas topicModel_fun builds
# a unigram vocabulary pruned with .2 / 20 -- confirm the difference is intended.
it <- itoken(text$textclean, ids = text$uniqueID, progressbar = FALSE)
v <- prune_vocabulary(
  create_vocabulary(it, ngram = c(1,2), stopwords = c(gsub("'",'',stopwords()))),
  doc_proportion_max = .5,
  term_count_min = 10
)
vectorizer <- vocab_vectorizer(v)
dtm <- create_dtm(it, vectorizer)
lda_model <- LDA$new(n_topics = 70, doc_topic_prior = 0.1, topic_word_prior = 0.01)
# Document-topic matrix for the fitted model
doc_topic_distr <- lda_model$fit_transform(
  x = dtm,
  n_iter = 1000,
  convergence_tol = 0.001,
  n_check_convergence = 25,
  progressbar = FALSE
)


# Reshape the document-topic matrix to long format: one row per
# (document id, topic) with its weight theta. The 'X' that data.frame()
# puts in front of the column names is stripped from the topic label.
doc_topic_distr <- doc_topic_distr %>%
  data.frame() %>%
  mutate(id = text$uniqueID) %>%
  as_tibble() %>%
  gather(topic, theta, -id) %>%
  mutate(topic = gsub('X', '', topic))

# Attach the topic weights (wide, columns named topic70Spkr_<n>) to the
# speaker-hearing documents. The join key is stated explicitly instead of
# relying on whichever column names happen to match.
# NOTE(review): uniqueID is built without a separator, so distinct
# (docID, opensecretsID) pairs could in principle collide -- confirm.
textToMerge <- text %>%
  left_join(doc_topic_distr %>%
              spread(topic, theta, sep = '70Spkr_') %>%
              rename(uniqueID = id),
            by = 'uniqueID')

# Every utterance receives the topic weights of its speaker-hearing document.
utterance_level <- utterance_level %>%
  left_join(textToMerge %>%
              select(docID, opensecretsID, matches('topic70Spkr')),
            by = c('docID', 'opensecretsID'))







# STM version
# library() stops immediately when a package is missing; require() would only
# return FALSE and let the script fail later.
library(stm)
library(tm)

# Utterance-level documents plus the covariates used by the STM.
# `interrupted` is 1 when the raw utterance text ends in '--'.
text <- utterance_level %>%
  mutate(textclean = lemmatize_strings(prep_fun(textclean)),
         # .data$text forces the lookup to the `text` column of
         # utterance_level. A global data frame called `text` also exists at
         # this point, and a bare `text` would silently fall back to it if the
         # column were missing.
         interrupted = ifelse(grepl('--$', .data$text), 1, 0)) %>%
  filter(!is.na(speaker)) %>%
  select(textclean, chamber, opensecretsID, position, party, nKids, nDaughters,
         nSons, votepct_rel, gender, age, seniority, nominate_dim1, fullInd,
         interrupted)

# Where party is missing, fall back to the speaker's position
text$party[which(is.na(text$party))] <- text$position[which(is.na(text$party))]

# Comparison groups: the opensecretsID itself when it contains 'FED',
# otherwise chamber and party ("Senate: <party>" / "House: <party>")
text <- text %>%
  mutate(comparisons = ifelse(grepl('FED', opensecretsID), opensecretsID,
                              ifelse(chamber == 'Senate', paste0('Senate: ', party),
                                     paste0('House: ', party))))

# Diagnostic: counts for the comparison groups that are not chamber-party labels
text %>%
  filter(!grepl(': ', comparisons)) %>%
  count(comparisons, opensecretsID)

processed <- textProcessor(documents = text$textclean, metadata = text)
out <- prepDocuments(processed$documents, processed$vocab, processed$meta)

# 100-topic STM; the effect regression below uses the same covariates, and its
# 1:100 topic range must match K.
fit <- stm(documents = out$documents, vocab = out$vocab,
           K = 100,
           prevalence = ~ comparisons + interrupted + s(nominate_dim1) + age + seniority + gender,
           max.em.its = 75, data = out$meta, init.type = 'LDA')

prep <- estimateEffect(1:100 ~ comparisons + interrupted + s(nominate_dim1) + age + seniority + gender,
                       fit, metadata = out$meta, uncertainty = 'Global')


save(prep, fit, out, file = '../output/stm_results.RData')

# NOTE(review): speaker_level is not created in this script; presumably it
# comes from ../data/finalData.RData -- confirm.
save(utterance_level, speaker_level, file = '../data/finalData_70kGrped.RData')



